Running DESeq

1. Loading files from both algorithms

First STAR mappinngs passed through featureCounts
readcounts <- read.table("/Users/mar/Documents/GitHub/ANGSD_PROJECT_MOA4020/Project/STAR/featureCounts/Counts.txt", header=TRUE)

names(readcounts) <- gsub(".it2.Aligned.sortedByCoord.out.bam", "", gsub("X.athena.angsd.scratch.moa4020.project.GEO_Dataset.STAR_alignments_it2.alignment_it2_files.\\w+\\.","",names(readcounts)))

str(readcounts)
## 'data.frame':    49997 obs. of  14 variables:
##  $ Geneid   : chr  "TrnP" "TrnT" "CYTB" "TrnE" ...
##  $ Chr      : chr  "chrM" "chrM" "chrM" "chrM" ...
##  $ Start    : chr  "15356" "15289" "14145" "14071" ...
##  $ End      : chr  "15422" "15355" "15288" "14139" ...
##  $ Strand   : chr  "-" "+" "+" "-" ...
##  $ Length   : int  67 67 1144 69 519 1824 71 59 67 1378 ...
##  $ SPP1_KO_1: int  422 6 28865 18 10003 48134 37 141 47 21703 ...
##  $ SPP1_KO_2: int  445 1 56344 100 14229 59595 63 66 54 30777 ...
##  $ SPP1_KO_3: int  506 1 69231 97 17203 66883 68 41 120 36651 ...
##  $ SPP1_KO_4: int  590 0 51625 45 17455 71666 76 132 87 34256 ...
##  $ wt_1     : int  436 3 62797 40 13394 50629 62 64 61 32342 ...
##  $ wt_2     : int  517 1 78611 42 16649 70607 79 119 103 41027 ...
##  $ wt_3     : int  324 3 56111 21 11436 43486 55 54 75 26556 ...
##  $ wt_4     : int  360 5 65895 44 13828 50778 59 56 95 31629 ...

Making a new matrix with just the counts and the transcript IDs for the row names

length(unique(readcounts[,1]))
## [1] 49997
length(readcounts[,1])
## [1] 49997
grouped_readcounts <- readcounts[,7:14]

row.names(grouped_readcounts) <- readcounts[,1]

Saving sample info for reference

sample_info <- data.frame(condition = gsub("_[0-9]+", "", colnames(grouped_readcounts)), row.names = colnames(grouped_readcounts))

sample_info
##           condition
## SPP1_KO_1   SPP1_KO
## SPP1_KO_2   SPP1_KO
## SPP1_KO_3   SPP1_KO
## SPP1_KO_4   SPP1_KO
## wt_1             wt
## wt_2             wt
## wt_3             wt
## wt_4             wt
str(sample_info)
## 'data.frame':    8 obs. of  1 variable:
##  $ condition: chr  "SPP1_KO" "SPP1_KO" "SPP1_KO" "SPP1_KO" ...
Now Salmon counts
wd <- getwd()

filenames <- list.files(path = paste0(wd,"/Salmon"),recursive = TRUE,pattern="quant.genes.sf")
filepaths <- list.files(path = paste0(wd,"/Salmon"),recursive = TRUE,pattern="quant.genes.sf",full.names = TRUE)

# Copying transcript names from one file for salmonResultMatrix's row names
salmonTest <- read.csv(file=filepaths[8], sep = "\t")

salmonResultMatrix <- matrix(nrow=nrow(salmonTest),ncol=length(filenames)+1)
row.names(salmonResultMatrix) <- salmonTest[,1]

# iterate through all the quant.sf files and add the count values under each replicate
for(i in 1:length(filepaths)){
  salmonResultMatrix[,i+1] <- read.csv(file=filepaths[i], sep = "\t")[,5]
}

# Adding column names for salmonResultMatrix

colnames(salmonResultMatrix) <- c("Length",gsub("/quant.genes.sf","",filenames))

# Copying transcript length data from the test file
salmonResultMatrix[,1] <- salmonTest[,2]

# Copying just the count values to a new matrix - salmonCounts

salmonCounts <- salmonResultMatrix[,-1]

# Removing transcript version details from row names
rownames(salmonCounts) <- sub("\\..*", "", rownames(salmonCounts))

# Saving a reference of the condition for each sample
salmon_sample_info <- data.frame(condition = gsub("_[0-9]+", "", colnames(salmonCounts)), row.names = colnames(salmonCounts))

nrow(salmonCounts)
## [1] 49152
salmon_sample_info
##           condition
## SPP1_KO_1   SPP1_KO
## SPP1_KO_2   SPP1_KO
## SPP1_KO_3   SPP1_KO
## SPP1_KO_4   SPP1_KO
## wt_1             wt
## wt_2             wt
## wt_3             wt
## wt_4             wt
str(salmon_sample_info)
## 'data.frame':    8 obs. of  1 variable:
##  $ condition: chr  "SPP1_KO" "SPP1_KO" "SPP1_KO" "SPP1_KO" ...

Converting the read counts into integers and adding the row and column names

salmonCountData <- matrix(as.integer(ceiling(as.numeric(salmonCounts))), ncol = 8)
row.names(salmonCountData) <- row.names(salmonCounts)
colnames(salmonCountData) <- colnames(salmonCounts)

Creating a DESeq object from both the results

DESeq.ds <- DESeqDataSetFromMatrix(countData = grouped_readcounts, 
                                   colData = sample_info,
                                   design = ~ condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
DESeq.ds
## class: DESeqDataSet 
## dim: 49997 8 
## metadata(1): version
## assays(1): counts
## rownames(49997): TrnP TrnT ... Xkr4 Gm26206
## rowData names(0):
## colnames(8): SPP1_KO_1 SPP1_KO_2 ... wt_3 wt_4
## colData names(1): condition
salmon_DESeq.ds <- DESeqDataSetFromMatrix(countData = salmonCountData, 
                                   colData = salmon_sample_info,
                                   design = ~ condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): 11 duplicate rownames
## were renamed by adding numbers
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
salmon_DESeq.ds
## class: DESeqDataSet 
## dim: 49152 8 
## metadata(1): version
## assays(1): counts
## rownames(49152): LOC100048823 LOC108168686 ... 1700067P10Rik Trem3
## rowData names(0):
## colnames(8): SPP1_KO_1 SPP1_KO_2 ... wt_3 wt_4
## colData names(1): condition
head(counts(DESeq.ds))
##      SPP1_KO_1 SPP1_KO_2 SPP1_KO_3 SPP1_KO_4  wt_1  wt_2  wt_3  wt_4
## TrnP       422       445       506       590   436   517   324   360
## TrnT         6         1         1         0     3     1     3     5
## CYTB     28865     56344     69231     51625 62797 78611 56111 65895
## TrnE        18       100        97        45    40    42    21    44
## ND6      10003     14229     17203     17455 13394 16649 11436 13828
## ND5      48134     59595     66883     71666 50629 70607 43486 50778
colSums(counts(DESeq.ds))
## SPP1_KO_1 SPP1_KO_2 SPP1_KO_3 SPP1_KO_4      wt_1      wt_2      wt_3      wt_4 
##  15330724  20682764  23315712  19236624  23547362  26726542  20706864  24887319
par(las=2)
colSums(counts(DESeq.ds)) %>% barplot(main = "STAR")

dim(DESeq.ds)
## [1] 49997     8
head(counts(salmon_DESeq.ds))
##              SPP1_KO_1 SPP1_KO_2 SPP1_KO_3 SPP1_KO_4 wt_1 wt_2 wt_3 wt_4
## LOC100048823         0         0         0         0    0    0    0    0
## LOC108168686         0         0         3         0    3    0    0    0
## LOC100861738         0         0         0         5    0    0    0    0
## LOC100861749         0         0         2         0    3    0    2    0
## LOC100043931         0         0         0         0    0    0    0    0
## LOC673652            0         0         0         0    0    0    0    0
colSums(counts(salmon_DESeq.ds))
## SPP1_KO_1 SPP1_KO_2 SPP1_KO_3 SPP1_KO_4      wt_1      wt_2      wt_3      wt_4 
##  15950982  21386879  24139399  19968988  24419887  27858579  21506463  25828594
par(las=2)
colSums(counts(salmon_DESeq.ds)) %>% barplot(main = "Salmon")

dim(salmon_DESeq.ds)
## [1] 49152     8

Removing genes with no counts in all the samples:

keep_genes <- rowSums(counts(DESeq.ds)) > 0

DESeq.ds <- DESeq.ds[ keep_genes, ]

dim(DESeq.ds)
## [1] 29710     8
counts(DESeq.ds) %>% str
##  int [1:29710, 1:8] 422 6 28865 18 10003 48134 37 141 47 21703 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:29710] "TrnP" "TrnT" "CYTB" "TrnE" ...
##   ..$ : chr [1:8] "SPP1_KO_1" "SPP1_KO_2" "SPP1_KO_3" "SPP1_KO_4" ...
assay(DESeq.ds) %>% str
##  int [1:29710, 1:8] 422 6 28865 18 10003 48134 37 141 47 21703 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:29710] "TrnP" "TrnT" "CYTB" "TrnE" ...
##   ..$ : chr [1:8] "SPP1_KO_1" "SPP1_KO_2" "SPP1_KO_3" "SPP1_KO_4" ...
salmon_keep_genes <- rowSums(counts(salmon_DESeq.ds)) > 0

salmon_DESeq.ds <- salmon_DESeq.ds[ salmon_keep_genes, ]

dim(salmon_DESeq.ds)
## [1] 26913     8
counts(salmon_DESeq.ds) %>% str
##  int [1:26913, 1:8] 0 0 0 3 476 0 0 362 0 0 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:26913] "LOC108168686" "LOC100861738" "LOC100861749" "Spry3" ...
##   ..$ : chr [1:8] "SPP1_KO_1" "SPP1_KO_2" "SPP1_KO_3" "SPP1_KO_4" ...
assay(salmon_DESeq.ds) %>% str
##  int [1:26913, 1:8] 0 0 0 3 476 0 0 362 0 0 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:26913] "LOC108168686" "LOC100861738" "LOC100861749" "Spry3" ...
##   ..$ : chr [1:8] "SPP1_KO_1" "SPP1_KO_2" "SPP1_KO_3" "SPP1_KO_4" ...
DESeq.ds <- estimateSizeFactors(DESeq.ds) # calculate SFs, add them to object
plot( sizeFactors(DESeq.ds), colSums(counts(DESeq.ds)), # assess them
ylab = "library sizes", xlab = "size factors", cex = .6 , main="STAR")

salmon_DESeq.ds <- estimateSizeFactors(salmon_DESeq.ds) # calculate SFs, add them to object
plot(sizeFactors(salmon_DESeq.ds), colSums(counts(salmon_DESeq.ds)), # assess them
ylab = "library sizes", xlab = "size factors", cex = .6, main="Salmon")

## bp of non-normalized
boxplot(log2(counts(DESeq.ds) +1), notch=TRUE,
main = "STAR Non-normalized read counts",
ylab ="log2(read counts)", cex = .6)

## bp of size-factor normalized values
boxplot(log2(counts(DESeq.ds, normalized=TRUE)+1), notch=TRUE,
main = "STAR Size-factor-normalized read counts",
ylab ="log2(read counts)", cex = .6)

## bp of non-normalized
boxplot(log2(counts(salmon_DESeq.ds)+1), notch=TRUE,
main = "Salmon Non-normalized read counts",
ylab ="log2(read counts)", cex = .6)

## bp of size-factor normalized values
boxplot(log2(counts(salmon_DESeq.ds, normalized=TRUE)+1), notch=TRUE,
main = "Salmon Size-factor-normalized read counts",
ylab ="log2(read counts)", cex = .6)

## non-normalized read counts plus pseudocount
log.counts <- log2(counts(DESeq.ds, normalized = FALSE) + 1)

## instead of creating a new object, we could assign the values to a distinct matrix
## within the DESeq.ds object
assay(DESeq.ds, "log.counts") <- log2(counts(DESeq.ds, normalized = FALSE) + 1)

## normalized read counts
assay(DESeq.ds, "log.norm.counts") <- log2(counts(DESeq.ds, normalized=TRUE) + 1)

DESeq.ds[, c("wt_1","wt_2")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "wt_1 vs. wt_2")

DESeq.ds[, c("SPP1_KO_1","SPP1_KO_2")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "SPP1_KO_1 vs SPP1_KO_2")

## non-normalized read counts plus pseudocount
log.counts <- log2(counts(salmon_DESeq.ds, normalized = FALSE) + 1)

## instead of creating a new object, we could assign the values to a distinct matrix
## within the DESeq.ds object
assay(salmon_DESeq.ds, "log.counts") <- log2(counts(salmon_DESeq.ds, normalized = FALSE) + 1)

## normalized read counts
assay(salmon_DESeq.ds, "log.norm.counts") <- log2(counts(salmon_DESeq.ds, normalized=TRUE) + 1)

salmon_DESeq.ds[, c("wt_1","wt_2")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "wt_1 vs. wt_2")

salmon_DESeq.ds[, c("SPP1_KO_1","SPP1_KO_2")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "SPP1_KO_1 vs SPP1_KO_2")

## generate the plot
msd_plot <- vsn::meanSdPlot(assay(DESeq.ds, "log.norm.counts"), 
                            ranks=FALSE, # show the data on the original scale
                            plot = FALSE)
msd_plot$gg + 
  ggtitle("Sequencing depth normalized log2(read counts)") + 
  ylab("standard deviation")

## generate the plot
msd_plot <- vsn::meanSdPlot(assay(salmon_DESeq.ds, "log.norm.counts"), 
                            ranks=FALSE, # show the data on the original scale
                            plot = FALSE)
msd_plot$gg + 
  ggtitle("Sequencing depth normalized log2(read counts)") + 
  ylab("standard deviation")

DESeq.rlog <- rlog(DESeq.ds, blind = TRUE)

par(mfrow=c(1,2))
plot(assay(DESeq.ds, "log.norm.counts")[,1:2], cex=.1,
     main = "size factor and log2-transformed")
## the rlog-transformed counts are stored in the "assay" accessor
plot(assay(DESeq.rlog)[,1:2],
     cex=.1, main = "rlog transformed",
     xlab = colnames(assay(DESeq.rlog[,1])),
     ylab = colnames(assay(DESeq.rlog[,2])) )

salmon_DESeq.rlog <- rlog(salmon_DESeq.ds, blind = TRUE)

par(mfrow=c(1,2))
plot(assay(salmon_DESeq.ds, "log.norm.counts")[,1:2], cex=.1,
     main = "size factor and log2-transformed")
## the rlog-transformed counts are stored in the "assay" accessor
plot(assay(salmon_DESeq.rlog)[,1:2],
     cex=.1, main = "rlog transformed",
     xlab = colnames(assay(salmon_DESeq.rlog[,1])),
     ylab = colnames(assay(salmon_DESeq.rlog[,2])) )

rlog.norm.counts <- assay(DESeq.rlog)
## rlog-transformed read counts
msd_plot <- vsn::meanSdPlot(assay(DESeq.rlog), ranks=FALSE, plot = FALSE)
msd_plot$gg + ggtitle("Following rlog transformation") +
  coord_cartesian(ylim = c(0,3))

salmon_rlog.norm.counts <- assay(salmon_DESeq.rlog)
## rlog-transformed read counts
salmon_msd_plot <- vsn::meanSdPlot(assay(salmon_DESeq.rlog), ranks=FALSE, plot = FALSE)
salmon_msd_plot$gg + ggtitle("Following rlog transformation") +
  coord_cartesian(ylim = c(0,3))

save.image(file = "DESeqReadyFiles.RData")
FILE_DSD="DESeqReadyFiles.RData"
load(FILE_DSD)
DESeq.ds
## class: DESeqDataSet 
## dim: 29710 8 
## metadata(1): version
## assays(3): counts log.counts log.norm.counts
## rownames(29710): TrnP TrnT ... Gm7341 Xkr4
## rowData names(0):
## colnames(8): SPP1_KO_1 SPP1_KO_2 ... wt_3 wt_4
## colData names(2): condition sizeFactor
DESeq.ds$condition
## [1] SPP1_KO SPP1_KO SPP1_KO SPP1_KO wt      wt      wt      wt     
## Levels: SPP1_KO wt
salmon_DESeq.ds
## class: DESeqDataSet 
## dim: 26913 8 
## metadata(1): version
## assays(3): counts log.counts log.norm.counts
## rownames(26913): LOC108168686 LOC100861738 ... 4921531C22Rik Trem3
## rowData names(0):
## colnames(8): SPP1_KO_1 SPP1_KO_2 ... wt_3 wt_4
## colData names(2): condition sizeFactor
salmon_DESeq.ds$condition
## [1] SPP1_KO SPP1_KO SPP1_KO SPP1_KO wt      wt      wt      wt     
## Levels: SPP1_KO wt
DESeq.ds$condition %<>% relevel(ref="wt")
DESeq.ds$condition
## [1] SPP1_KO SPP1_KO SPP1_KO SPP1_KO wt      wt      wt      wt     
## Levels: wt SPP1_KO
salmon_DESeq.ds$condition %<>% relevel(ref="wt")
salmon_DESeq.ds$condition
## [1] SPP1_KO SPP1_KO SPP1_KO SPP1_KO wt      wt      wt      wt     
## Levels: wt SPP1_KO
design(DESeq.ds)
## ~condition
DESeq.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
DESeq.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
DESeq.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene 
DESeq.ds %<>% nbinomWaldTest()

DESeq.ds
## class: DESeqDataSet 
## dim: 29710 8 
## metadata(1): version
## assays(6): counts log.counts ... H cooks
## rownames(29710): TrnP TrnT ... Gm7341 Xkr4
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(8): SPP1_KO_1 SPP1_KO_2 ... wt_3 wt_4
## colData names(2): condition sizeFactor
design(salmon_DESeq.ds)
## ~condition
salmon_DESeq.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
salmon_DESeq.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
salmon_DESeq.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene 
salmon_DESeq.ds %<>% nbinomWaldTest()

salmon_DESeq.ds
## class: DESeqDataSet 
## dim: 26913 8 
## metadata(1): version
## assays(6): counts log.counts ... H cooks
## rownames(26913): LOC108168686 LOC100861738 ... 4921531C22Rik Trem3
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(8): SPP1_KO_1 SPP1_KO_2 ... wt_3 wt_4
## colData names(2): condition sizeFactor
rowData(DESeq.ds) %>% colnames
##  [1] "baseMean"                             
##  [2] "baseVar"                              
##  [3] "allZero"                              
##  [4] "dispGeneEst"                          
##  [5] "dispGeneIter"                         
##  [6] "dispFit"                              
##  [7] "dispersion"                           
##  [8] "dispIter"                             
##  [9] "dispOutlier"                          
## [10] "dispMAP"                              
## [11] "Intercept"                            
## [12] "condition_SPP1_KO_vs_wt"              
## [13] "SE_Intercept"                         
## [14] "SE_condition_SPP1_KO_vs_wt"           
## [15] "WaldStatistic_Intercept"              
## [16] "WaldStatistic_condition_SPP1_KO_vs_wt"
## [17] "WaldPvalue_Intercept"                 
## [18] "WaldPvalue_condition_SPP1_KO_vs_wt"   
## [19] "betaConv"                             
## [20] "betaIter"                             
## [21] "deviance"                             
## [22] "maxCooks"
rowData(DESeq.ds)$WaldPvalue_condition_SPP1_KO_vs_wt %>% hist(breaks=19, main="Raw p-values for SPP1_KO vs wt")

rowData(salmon_DESeq.ds) %>% colnames
##  [1] "baseMean"                             
##  [2] "baseVar"                              
##  [3] "allZero"                              
##  [4] "dispGeneEst"                          
##  [5] "dispGeneIter"                         
##  [6] "dispFit"                              
##  [7] "dispersion"                           
##  [8] "dispIter"                             
##  [9] "dispOutlier"                          
## [10] "dispMAP"                              
## [11] "Intercept"                            
## [12] "condition_SPP1_KO_vs_wt"              
## [13] "SE_Intercept"                         
## [14] "SE_condition_SPP1_KO_vs_wt"           
## [15] "WaldStatistic_Intercept"              
## [16] "WaldStatistic_condition_SPP1_KO_vs_wt"
## [17] "WaldPvalue_Intercept"                 
## [18] "WaldPvalue_condition_SPP1_KO_vs_wt"   
## [19] "betaConv"                             
## [20] "betaIter"                             
## [21] "deviance"                             
## [22] "maxCooks"
rowData(salmon_DESeq.ds)$WaldPvalue_condition_SPP1_KO_vs_wt %>% hist(breaks=19, main="Raw p-values for SPP1_KO vs wt")

DGE.results <- results(DESeq.ds, independentFiltering = TRUE, alpha = 0.05)
# the first line will tell you which comparison was done to achieve the log2FC 
head(DGE.results)
## log2 fold change (MLE): condition SPP1 KO vs wt 
## Wald test p-value: condition SPP1 KO vs wt 
## DataFrame with 6 rows and 6 columns
##         baseMean log2FoldChange     lfcSE       stat      pvalue        padj
##        <numeric>      <numeric> <numeric>  <numeric>   <numeric>   <numeric>
## TrnP   452.39676      0.5955603  0.166828  3.5698978 3.57121e-04 0.001950977
## TrnT     2.64012     -0.1183985  1.296677 -0.0913092 9.27247e-01 0.963156134
## CYTB 56565.35021     -0.0764638  0.135832 -0.5629292 5.73483e-01 0.722699399
## TrnE    49.54153      1.0708467  0.417314  2.5660481 1.02865e-02 0.035636123
## ND6  14106.62831      0.3970080  0.120152  3.3042057 9.52459e-04 0.004619597
## ND5  57548.84414      0.5173162  0.128589  4.0230240 5.74556e-05 0.000379555
salmon_DGE.results <- results(salmon_DESeq.ds, independentFiltering = TRUE, alpha = 0.05)
# the first line will tell you which comparison was done to achieve the log2FC 
head(salmon_DGE.results)
## log2 fold change (MLE): condition SPP1 KO vs wt 
## Wald test p-value: condition SPP1 KO vs wt 
## DataFrame with 6 rows and 6 columns
##                baseMean log2FoldChange     lfcSE       stat      pvalue
##               <numeric>      <numeric> <numeric>  <numeric>   <numeric>
## LOC108168686   0.676856      0.0934023  3.359614  0.0278015 0.977820474
## LOC100861738   0.703059      3.1083170  3.444056  0.9025165 0.366782552
## LOC100861749   0.814896     -1.1609272  2.713580 -0.4278212 0.668781280
## Spry3         30.536510     -1.7286235  0.493379 -3.5036419 0.000458942
## Vamp7        729.855253     -0.0455607  0.114357 -0.3984069 0.690330288
## LOC105243599   0.488945     -1.0807886  3.121793 -0.3462077 0.729186621
##                    padj
##               <numeric>
## LOC108168686         NA
## LOC100861738         NA
## LOC100861749         NA
## Spry3        0.00240699
## Vamp7        0.80986940
## LOC105243599         NA
summary(DGE.results)
## 
## out of 29710 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 3344, 11%
## LFC < 0 (down)     : 3053, 10%
## outliers [1]       : 2, 0.0067%
## low counts [2]     : 9216, 31%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
summary(salmon_DGE.results)
## 
## out of 26913 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 3296, 12%
## LFC < 0 (down)     : 3058, 11%
## outliers [1]       : 22, 0.082%
## low counts [2]     : 6783, 25%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# the DESeqResult object can basically be handled like a data.frame
table(DGE.results$padj < 0.05)
## 
## FALSE  TRUE 
## 14095  6397
table(salmon_DGE.results$padj < 0.05)
## 
## FALSE  TRUE 
## 13754  6354
DGE.results$padj %>%
    hist(breaks=19, main="STAR adjusted p-values for SPP1_KO vs WT")

salmon_DGE.results$padj %>%
    hist(breaks=19, main="Salmon adjusted p-values for SPP1_KO vs WT")

DGE.results.sorted <- DGE.results %>% `[`(order(.$padj),)
head(DGE.results.sorted)
## log2 fold change (MLE): condition SPP1 KO vs wt 
## Wald test p-value: condition SPP1 KO vs wt 
## DataFrame with 6 rows and 6 columns
##          baseMean log2FoldChange     lfcSE      stat       pvalue         padj
##         <numeric>      <numeric> <numeric> <numeric>    <numeric>    <numeric>
## Spp1    14917.826       -7.11281 0.1611036  -44.1505  0.00000e+00  0.00000e+00
## Mrc1     1130.756        4.33284 0.1201555   36.0603 9.52727e-285 9.76164e-281
## Cd93     1779.027        4.93173 0.1484807   33.2146 6.61766e-242 4.52031e-238
## Xdh       967.098        3.70743 0.1148878   32.2700 1.84404e-228 9.44704e-225
## Pld4     1303.429        3.32855 0.1110480   29.9739 2.14610e-197 8.79558e-194
## Pik3ap1  1679.039        2.76266 0.0937521   29.4677 7.46436e-191 2.54933e-187
salmon_DGE.results.sorted <- salmon_DGE.results %>% `[`(order(.$padj),)
head(salmon_DGE.results.sorted)
## log2 fold change (MLE): condition SPP1 KO vs wt 
## Wald test p-value: condition SPP1 KO vs wt 
## DataFrame with 6 rows and 6 columns
##          baseMean log2FoldChange     lfcSE      stat       pvalue         padj
##         <numeric>      <numeric> <numeric> <numeric>    <numeric>    <numeric>
## Mrc1      1134.00        4.37009 0.1158709   37.7152  0.00000e+00  0.00000e+00
## Spp1     15131.58       -6.29562 0.1065286  -59.0979  0.00000e+00  0.00000e+00
## Cd93      1760.21        4.95606 0.1494439   33.1634 3.63555e-241 2.43679e-237
## Xdh        965.02        3.70092 0.1128660   32.7904 8.06465e-236 4.05410e-232
## Pik3ap1   1723.48        2.77874 0.0907901   30.6062 1.01213e-205 4.07037e-202
## Pld4      1274.49        3.34293 0.1111628   30.0724 1.11255e-198 3.72854e-195
par(mfrow=c(1,2))
plotCounts(DESeq.ds, gene="Mrc1", normalized = TRUE, xlab="")
plotCounts(DESeq.ds, gene = which.max(DGE.results$padj), xlab="",
           main = "Gene with max. p.adj.\n(=least significant)")

par(mfrow=c(1,2))
plotCounts(salmon_DESeq.ds, gene="Mrc1", normalized = TRUE, xlab="")
plotCounts(salmon_DESeq.ds, gene = which.max(salmon_DGE.results$padj), xlab="",
           main = "Gene with max. p.adj.\n(=least significant)")

# identify genes with the desired adjusted p-value cut-off
DGEgenes <- rownames(subset(DGE.results.sorted, padj < 0.05)) 
# extract rlog-transformed values into a matrix
rlog.dge <- DESeq.rlog[DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(rlog.dge, scale="none",
         show_rownames=FALSE, main="STAR DGE (no scaling)",
         color=colorRampPalette(RColorBrewer::brewer.pal(n=7, name="Reds"))(100))

# identify genes with the desired adjusted p-value cut-off
salmon_DGEgenes <- rownames(subset(salmon_DGE.results.sorted, padj < 0.05)) 
# extract rlog-transformed values into a matrix
salmon_rlog.dge <- salmon_DESeq.rlog[salmon_DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(salmon_rlog.dge, scale="none",
         show_rownames=FALSE, main="Salmon DGE (no scaling)",
         color=colorRampPalette(RColorBrewer::brewer.pal(n=7, name="Reds"))(100))

pheatmap(rlog.dge, scale="row",
         show_rownames=FALSE, main="STAR DGE (row-based z-score)")

pheatmap(salmon_rlog.dge, scale="row",
         show_rownames=FALSE, main="Salmon DGE (row-based z-score)")

DESeq2::plotMA(DGE.results, alpha=0.05,
       main="STAR Test: p.adj.value < 0.05", ylim = c(-4,4))

DESeq2::plotMA(salmon_DGE.results, alpha=0.05,
       main="Salmon Test: p.adj.value < 0.05", ylim = c(-4,4))

vp1.1 <- EnhancedVolcano(DGE.results,
                       lab=rownames(DGE.results),
                       x='log2FoldChange', y='padj',
                       pCutoff=0.05,
                       title="STAR SPP1_KO / wt")
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest
## non-zero p-value...
print(vp1.1)

vp1.2 <- EnhancedVolcano(salmon_DGE.results,
                       lab=rownames(salmon_DGE.results),
                       x='log2FoldChange', y='padj',
                       pCutoff=0.05,
                       title="Salmon SPP1_KO / wt")
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest
## non-zero p-value...
print(vp1.2)

DGE.results.shrnk <- lfcShrink(DESeq.ds,
            coef=2,
            type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resultsNames(DESeq.ds)
## [1] "Intercept"               "condition_SPP1_KO_vs_wt"
salmon_DGE.results.shrnk <- lfcShrink(salmon_DESeq.ds,
            coef=2,
            type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resultsNames(salmon_DESeq.ds)
## [1] "Intercept"               "condition_SPP1_KO_vs_wt"
par(mfrow = c(1,2))
DESeq2::plotMA(DGE.results, alpha=0.05,
       main="no shrinkage", ylim=c(-4,4))
DESeq2::plotMA(DGE.results.shrnk, alpha=0.05,
       main="with logFC shrinkage", ylim=c(-4,4))

par(mfrow = c(1,2))
DESeq2::plotMA(salmon_DGE.results, alpha=0.05,
       main="no shrinkage", ylim=c(-4,4))
DESeq2::plotMA(salmon_DGE.results.shrnk, alpha=0.05,
       main="with logFC shrinkage", ylim=c(-4,4))

vp2.1 <- EnhancedVolcano(DGE.results.shrnk, lab=rownames(DGE.results.shrnk),
                       x='log2FoldChange', y='padj', pCutoff = 0.05,
                       title="with logFC shrinkage")
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest
## non-zero p-value...
vp1.1 + vp2.1

vp2.2 <- EnhancedVolcano(salmon_DGE.results.shrnk, lab=rownames(salmon_DGE.results.shrnk),
                       x='log2FoldChange', y='padj', pCutoff = 0.05,
                       title="with logFC shrinkage")
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest
## non-zero p-value...
vp1.2 + vp2.2

DGE.results %>% `[`(order(.$padj),) %>% head
## log2 fold change (MLE): condition SPP1 KO vs wt 
## Wald test p-value: condition SPP1 KO vs wt 
## DataFrame with 6 rows and 6 columns
##          baseMean log2FoldChange     lfcSE      stat       pvalue         padj
##         <numeric>      <numeric> <numeric> <numeric>    <numeric>    <numeric>
## Spp1    14917.826       -7.11281 0.1611036  -44.1505  0.00000e+00  0.00000e+00
## Mrc1     1130.756        4.33284 0.1201555   36.0603 9.52727e-285 9.76164e-281
## Cd93     1779.027        4.93173 0.1484807   33.2146 6.61766e-242 4.52031e-238
## Xdh       967.098        3.70743 0.1148878   32.2700 1.84404e-228 9.44704e-225
## Pld4     1303.429        3.32855 0.1110480   29.9739 2.14610e-197 8.79558e-194
## Pik3ap1  1679.039        2.76266 0.0937521   29.4677 7.46436e-191 2.54933e-187
salmon_DGE.results %>% `[`(order(.$padj),) %>% head
## log2 fold change (MLE): condition SPP1 KO vs wt 
## Wald test p-value: condition SPP1 KO vs wt 
## DataFrame with 6 rows and 6 columns
##          baseMean log2FoldChange     lfcSE      stat       pvalue         padj
##         <numeric>      <numeric> <numeric> <numeric>    <numeric>    <numeric>
## Mrc1      1134.00        4.37009 0.1158709   37.7152  0.00000e+00  0.00000e+00
## Spp1     15131.58       -6.29562 0.1065286  -59.0979  0.00000e+00  0.00000e+00
## Cd93      1760.21        4.95606 0.1494439   33.1634 3.63555e-241 2.43679e-237
## Xdh        965.02        3.70092 0.1128660   32.7904 8.06465e-236 4.05410e-232
## Pik3ap1   1723.48        2.77874 0.0907901   30.6062 1.01213e-205 4.07037e-202
## Pld4      1274.49        3.34293 0.1111628   30.0724 1.11255e-198 3.72854e-195
STAR_gene_name <- row.names(subset(DGE.results, padj < 0.05))
write.table(cbind(STAR_gene_name,subset(DGE.results, padj < 0.05)),
            file="STAR_DESeq2results_wt-vs-SPP1_KO.txt",
            sep="\t", quote=FALSE, row.names=FALSE)
salmon_gene_name <- row.names(subset(salmon_DGE.results, padj < 0.05))
write.table(cbind(salmon_gene_name,subset(salmon_DGE.results, padj < 0.05)),
            file="Salmon_DESeq2results_wt-vs-SPP1_KO.txt",
            sep="\t", quote=FALSE, row.names=FALSE)